Libraries

Load data

I am using the AHA dataset for writing my code because it is the simplest dataset I can work with. Simplest = significant number of baseline samples without issue of repeated measures, balanced distribution of classes, likely difference to be detected present.

p_ps = readRDS("/Users/aa370/Library/CloudStorage/Box-Box/project_davidlab/LAD_LAB_Personnel/Ammara_A/Projects/POMMS/R24_POMMS/data/processed/20221104_pomms_metadata_plant.rds")

p_ps

sample_data(p_ps)

Preprocess

Remove problematic samples

Taking out samples with less than or equal to 0 reads and keeping only entry timepoint reduces samples from 384 down to 240.

p_ps = p_ps %>%
  subset_samples( reads > 0) %>%
  subset_samples(timepoint == "Entry")

p_ps

Class distribution in this dataset: Case = 193 Control = 47

sample_data(p_ps) %>%
  as.data.frame() %>%
  as_tibble() %>%
  group_by(group) %>%
  summarise(count = n())

Data transform

otu_clr = abundances(p_ps, "clr") %>% # rclr transform could not be performed, get Nans
  t()

p_ps = phyloseq(otu_table(otu_clr, taxa_are_rows = FALSE),
                   sample_data(sample_data(p_ps)),
                   tax_table(tax_table(p_ps)))

rm(otu_clr)

Feature table

Extract data

Order of asv colnames in OTU table and tax table is the same.

features = p_ps@otu_table %>%
  as.data.frame() %>%
  as_tibble(rownames = NA) %>%
  rownames_to_column("sample")

feature_labels = c("sample")

 names = tax_table(p_ps) %>%
      data.frame() %>%
      lowest_level() %>%
   pull(name)

 feature_labels = append(feature_labels, names)

 feature_labels = make.unique(feature_labels) %>%
   make.names()

 colnames(features) = feature_labels

features

Attach data labels

#Subsampling: SMOTE

Zero variance features

No features have zero variance

nzv = subsampled %>%
  select(!group) %>%
  nearZeroVar(saveMetrics = TRUE)
rm(nzv)

Correlated predictors

summary(feature_cor[upper.tri(feature_cor)])
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
-0.52293 -0.04691  0.10074  0.20495  0.35340  1.00000 
high_cor
  [1]  89  97 102 105 119 139 141 148 151 153 155 156 176 185 186 207 208 216 220 221 222 225 231 236
 [25] 238 241 251  32  55  69  75  76  77  78  80  83  85  86  88  91  92  94  95  98 101 103 104 106
 [49] 107 108 109 110 115 117 118 125 126 130 131 132 133 135 138 140 142 143 146 149 150 154 157 158
 [73] 159 100 160 172 174 179  74 181 189 190 191 192 193 194 195 199 200 202 203 204 205 206 209 210
 [97] 212 213 214 215 218 219 223 224 226 227 228 229 230 232 233 235 240 242 244 245 247 249
rm(high_cor)
rm(feature_cor)

Linear dependencies

feature_ld = filtered_subsampled %>%
  select(!group) %>%
  findLinearCombos()

filtered_subsampled = filtered_subsampled[, - feature_ld$remove]

Input table

Preprocessing reduces column number to 102 from 235

Modeling

Load functions from classification_function_AA file

results
[[1]]

[[2]]
NA
rm(label_list)

Plotting

AUC plot and t-test

auc_df %>%
  pivot_longer(cols = c("group", "group_shuffle"), names_to = "feature", values_to = "AUC") %>%
  ggplot()+
  geom_boxplot(aes(x=feature, y = AUC))+
  theme_classic()


t.test(auc_df$group, auc_df$group_shuffle)

    Welch Two Sample t-test

data:  auc_df$group and auc_df$group_shuffle
t = 21.57, df = 15.18, p-value = 8.37e-13
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 0.3677251 0.4482749
sample estimates:
mean of x mean of y 
    0.942     0.534 

Importance plots

look-up table

#Making asv reference for names in model
lookup_asv = tax_table(p_ps) %>% 
     data.frame() %>%
      lowest_level() %>%
   rownames_to_column("asv") %>% 
   select(asv, name) %>%
   mutate(name = make.names(make.unique(name)))

summarize importances

top 10 ranked list

# Getting the ranked list
ranking = importance_summary %>%
  filter(variable == "group_shuffle") %>%
  pull(food)

top10 = ranking[1:10]

Importance plot

importance_df %>%
  pivot_longer(cols = c("Helianthus.annuus":"Rheum.rhaponticum"), names_to = "food", values_to = "importance")  %>%
  filter(variable == "group_shuffle" & food %in% top10) %>%
  ggplot()+
  geom_boxplot(aes(x= reorder(food,-importance), y = importance))+
  labs(x= "Food", y ="Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust=1))

Some of the important NAs for predictions only map to bacterial species in BLAST.

Important taxa directions

lookup_asv %>%
  filter(name == "Theobroma.cacao") %>%
  pull(asv)
[1] "ATCCTATTATTTTATTATTTTACGAAACTAAACAAAGGTTCAGCAAGCGAGAATAATAAAAAAAG"
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKIyBMaWJyYXJpZXMKCmBgYHtyLCBpbmNsdWRlID0gRkFMU0V9CmxpYnJhcnkodGlkeXZlcnNlKQpsaWJyYXJ5KG1pY3JvYmlvbWUpCmxpYnJhcnkocGh5bG9zZXEpCmxpYnJhcnkoY2FyZXQpCmxpYnJhcnkocmFuZG9tRm9yZXN0KQpsaWJyYXJ5KE1CdXRpbHMpCmxpYnJhcnkoTUxldmFsKQpsaWJyYXJ5KERNd1IpCmBgYAoKIyBMb2FkIGRhdGEKCkkgYW0gdXNpbmcgdGhlIEFIQSBkYXRhc2V0IGZvciB3cml0aW5nIG15IGNvZGUgYmVjYXVzZSBpdCBpcyB0aGUgc2ltcGxlc3QgZGF0YXNldCBJIGNhbiB3b3JrIHdpdGguClNpbXBsZXN0ID0gc2lnbmlmaWNhbnQgbnVtYmVyIG9mIGJhc2VsaW5lIHNhbXBsZXMgd2l0aG91dCBpc3N1ZSBvZiByZXBlYXRlZCBtZWFzdXJlcywgYmFsYW5jZWQgZGlzdHJpYnV0aW9uIG9mIGNsYXNzZXMsIGxpa2VseSBkaWZmZXJlbmNlIHRvIGJlIGRldGVjdGVkIHByZXNlbnQuCgoKYGBge3J9CnBfcHMgPSByZWFkUkRTKCIvVXNlcnMvYWEzNzAvTGlicmFyeS9DbG91ZFN0b3JhZ2UvQm94LUJveC9wcm9qZWN0X2RhdmlkbGFiL0xBRF9MQUJfUGVyc29ubmVsL0FtbWFyYV9BL1Byb2plY3RzL1BPTU1TL1IyNF9QT01NUy9kYXRhL3Byb2Nlc3NlZC8yMDIyMTEwNF9wb21tc19tZXRhZGF0YV9wbGFudC5yZHMiKQoKcF9wcwoKc2FtcGxlX2RhdGEocF9wcykKYGBgCgojIFByZXByb2Nlc3MKCiMjIFJlbW92ZSBwcm9ibGVtYXRpYyBzYW1wbGVzCgpUYWtpbmcgb3V0IHNhbXBsZXMgd2l0aCBsZXNzIHRoYW4gb3IgZXF1YWwgdG8gMCByZWFkcyBhbmQga2VlcGluZyBvbmx5IGVudHJ5IHRpbWVwb2ludCByZWR1Y2VzIHNhbXBsZXMgZnJvbSAzODQgZG93biB0byAyNDAuCgpgYGB7cn0KcF9wcyA9IHBfcHMgJT4lCiAgc3Vic2V0X3NhbXBsZXMoIHJlYWRzID4gMCkgJT4lCiAgc3Vic2V0X3NhbXBsZXModGltZXBvaW50ID09ICJFbnRyeSIpCgpwX3BzCmBgYAoKQ2xhc3MgZGlzdHJpYnV0aW9uIGluIHRoaXMgZGF0YXNldDoKQ2FzZSA9IDE5MwpDb250cm9sID0gNDcKCmBgYHtyfQpzYW1wbGVfZGF0YShwX3BzKSAlPiUKICBhcy5kYXRhLmZyYW1lKCkgJT4lCiAgYXNfdGliYmxlKCkgJT4lCiAgZ3JvdXBfYnkoZ3JvdXApICU+JQogIHN1bW1hcmlzZShjb3VudCA9IG4oKSkKYGBgCgojIyBEYXRhIHRyYW5zZm9ybQoKYGBge3J9Cm90dV9jbHIgPSBhYnVuZGFuY2VzKHBfcHMsICJjbHIiKSAlPiUgIyByY2xyIHRyYW5zZm9ybSBjb3VsZCBub3QgYmUgcGVyZm9ybWVkLCBnZXQgTmFucwogIHQoKQoKcF9wcyA9IHBoeWxvc2VxKG90dV90YWJsZShvdHVfY2xyLCB0YXhhX2FyZV9yb3dzID0gRkFMU0UpLAogICAgICAgICAgICAgICAgICAgc2FtcGxlX2RhdGEoc2FtcGxlX2RhdGEocF9wcykpLAogICAgICAgICAgICAgICAgICAgdGF4X3RhYmxlKHRheF90YWJsZShwX3BzKSkpCgpybShvdHVfY2xyKQpgYGAKCiMjIEZlYXR1cmUgdGFibGUKCiMjIyBFeHRyYWN0IGRhdGEKCk9yZGVyIG9mIGFzdiBjb2xuYW1lcyBpbiBPVFUgdGFibGUgYW5kIHRheCB0YWJsZSBpcyB0aGUgc2FtZS4KYGBge3J9CmZlYXR1cmVzID0gcF9wc0BvdHVfdGFibGUgJT4lCiAgYXMuZGF0YS5mcmFtZSgpICU+JQogIGFzX3RpYmJsZShyb3duYW1lcyA9IE5BKSAlPiUKICByb3duYW1lc190b19jb2x1bW4oInNhbXBsZSIpCgpmZWF0dXJlX2xhYmVscyA9IGMoInNhbXBsZSIpCgogbmFtZXMgPSB0YXhfdGFibGUocF9wcykgJT4lCiAgICAgIGRhdGEuZnJhbWUoKSAlPiUKICAgICAgbG93ZXN0X2xldmVsKCkgJT4lCiAgIHB1bGwobmFtZSkKCiBmZWF0dXJlX2xhYmVscyA9IGFwcGVuZChmZWF0dXJlX2xhYmVscywgbmFtZXMpCgogZmVhdHVyZV9sYWJlbHMgPSBtYWtlLnVuaXF1ZShmZWF0dXJlX2xhYmVscykgJT4lCiAgIG1ha2UubmFtZXMoKQoKIGNvbG5hbWVzKGZlYXR1cmVzKSA9IGZlYXR1cmVfbGFiZWxzCgpmZWF0dXJlcwpgYGAKIyMjIEF0dGFjaCBkYXRhIGxhYmVscwpgYGB7cn0Kc2FtcGxlX2xhYmVscyA9IHBfcHNAc2FtX2RhdGElPiUKICBhcy5kYXRhLmZyYW1lKCkgJT4lCiAgYXNfdGliYmxlKHJvd25hbWVzID0gTkEpICU+JQogIHJvd25hbWVzX3RvX2NvbHVtbigic2FtcGxlIikgJT4lCiAgc2VsZWN0KHNhbXBsZSwgZ3JvdXApCgpmZWF0dXJlcyA9IHNhbXBsZV9sYWJlbHMgJT4lCiAgbGVmdF9qb2luKGZlYXR1cmVzKQpgYGAKI1N1YnNhbXBsaW5nOiBTTU9URQoKYGBge3J9CnNtb3RlX2lucHV0ID0gZmVhdHVyZXMgJT4lCiAgc2VsZWN0KC1zYW1wbGUpICU+JQogIG11dGF0ZShncm91cCA9IGFzLmZhY3Rvcihncm91cCkpICU+JQogIGFzLmRhdGEuZnJhbWUoKQoKc3Vic2FtcGxlZCA9IFNNT1RFKGdyb3VwIH4gLiwgZGF0YSAgPSBzbW90ZV9pbnB1dCkKCnN1YnNhbXBsZWQlPiUKICBncm91cF9ieShncm91cCkgJT4lCiAgc3VtbWFyaXNlKGNvdW50ID0gbigpKQpgYGAKIyMjIFplcm8gdmFyaWFuY2UgZmVhdHVyZXMKCk5vIGZlYXR1cmVzIGhhdmUgemVybyB2YXJpYW5jZQpgYGB7cn0Kbnp2ID0gc3Vic2FtcGxlZCAlPiUKICBzZWxlY3QoIWdyb3VwKSAlPiUKICBuZWFyWmVyb1ZhcihzYXZlTWV0cmljcyA9IFRSVUUpCmBgYAoKYGBge3J9CnJtKG56dikKYGBgCgojIyMgQ29ycmVsYXRlZCBwcmVkaWN0b3JzCgpgYGB7cn0KZmVhdHVyZV9jb3IgPSBzdWJzYW1wbGVkICU+JQogc2VsZWN0KCFncm91cCkgJT4lCiBjb3IoKSAKCnN1bW1hcnkoZmVhdHVyZV9jb3JbdXBwZXIudHJpKGZlYXR1cmVfY29yKV0pCmBgYAoKYGBge3J9CmhpZ2hfY29yID0gZmluZENvcnJlbGF0aW9uKGZlYXR1cmVfY29yLCBjdXRvZmYgPSAuNzUpCgpmaWx0ZXJlZF9zdWJzYW1wbGVkID0gc3Vic2FtcGxlZFssLWhpZ2hfY29yXQpgYGAKCmBgYHtyfQpybShoaWdoX2NvcikKcm0oZmVhdHVyZV9jb3IpCmBgYAoKIyMjIExpbmVhciBkZXBlbmRlbmNpZXMKCmBgYHtyfQpmZWF0dXJlX2xkID0gZmlsdGVyZWRfc3Vic2FtcGxlZCAlPiUKICBzZWxlY3QoIWdyb3VwKSAlPiUKICBmaW5kTGluZWFyQ29tYm9zKCkKCmZpbHRlcmVkX3N1YnNhbXBsZWQgPSBmaWx0ZXJlZF9zdWJzYW1wbGVkWywgLSBmZWF0dXJlX2xkJHJlbW92ZV0KYGBgCgojIyBJbnB1dCB0YWJsZQoKUHJlcHJvY2Vzc2luZyByZWR1Y2VzIGNvbHVtbiBudW1iZXIgdG8gMTAyIGZyb20gMjM1CgpgYGB7cn0KaW5wdXQgPSBmaWx0ZXJlZF9zdWJzYW1wbGVkICU+JQogIG11dGF0ZShncm91cF9zaHVmZmxlID0gc2FtcGxlKGdyb3VwKSkgJT4lCiAgcm93bmFtZXNfdG9fY29sdW1uKCJzYW1wbGUiKQpgYGAKCiMgTW9kZWxpbmcKCkxvYWQgZnVuY3Rpb25zIGZyb20gY2xhc3NpZmljYXRpb25fZnVuY3Rpb25fQUEgZmlsZQoKYGBge3J9CmxhYmVsX2xpc3QgPSBjKCJncm91cCIsICJncm91cF9zaHVmZmxlIikKcmVzdWx0cyA9IGxvb3BfbGFiZWwoaXRlcmF0aW9ucyA9IDEwLCBpbnB1dCwgbGFiZWxfbGlzdCwgcGVyY2VudCA9IDAuNykKCmF1Y19kZiA9IHJlc3VsdHNbWzJdXQoKaW1wb3J0YW5jZV9kZiA9IHJlc3VsdHNbWzFdXQpgYGAKCmBgYHtyfQpybShsYWJlbF9saXN0KQpgYGAKCiMgUGxvdHRpbmcKCiMjIEFVQyBwbG90IGFuZCB0LXRlc3QKYGBge3J9CmF1Y19kZiAlPiUKICBwaXZvdF9sb25nZXIoY29scyA9IGMoImdyb3VwIiwgImdyb3VwX3NodWZmbGUiKSwgbmFtZXNfdG8gPSAiZmVhdHVyZSIsIHZhbHVlc190byA9ICJBVUMiKSAlPiUKICBnZ3Bsb3QoKSsKICBnZW9tX2JveHBsb3QoYWVzKHg9ZmVhdHVyZSwgeSA9IEFVQykpKwogIHRoZW1lX2NsYXNzaWMoKQoKdC50ZXN0KGF1Y19kZiRncm91cCwgYXVjX2RmJGdyb3VwX3NodWZmbGUpCgpgYGAKCiMjIEltcG9ydGFuY2UgcGxvdHMKCiMjIyBsb29rLXVwIHRhYmxlCmBgYHtyfQojTWFraW5nIGFzdiByZWZlcmVuY2UgZm9yIG5hbWVzIGluIG1vZGVsCmxvb2t1cF9hc3YgPSB0YXhfdGFibGUocF9wcykgJT4lIAogICAgIGRhdGEuZnJhbWUoKSAlPiUKICAgICAgbG93ZXN0X2xldmVsKCkgJT4lCiAgIHJvd25hbWVzX3RvX2NvbHVtbigiYXN2IikgJT4lIAogICBzZWxlY3QoYXN2LCBuYW1lKSAlPiUKICAgbXV0YXRlKG5hbWUgPSBtYWtlLm5hbWVzKG1ha2UudW5pcXVlKG5hbWUpKSkKYGBgCgojIyMgc3VtbWFyaXplIGltcG9ydGFuY2VzCmBgYHtyfQojIE1ha2luZyBzdW1tYXJ5IGRmIG9mIGltcG9ydGFuY2VzCmltcG9ydGFuY2Vfc3VtbWFyeSA9IGltcG9ydGFuY2VfZGYgJT4lCiAgcGl2b3RfbG9uZ2VyKGNvbHMgPSBjKCJIZWxpYW50aHVzLmFubnV1cyI6IlJoZXVtLnJoYXBvbnRpY3VtIiksIG5hbWVzX3RvID0gImZvb2QiLCB2YWx1ZXNfdG8gPSAiaW1wb3J0YW5jZSIpICU+JQogIGdyb3VwX2J5KHZhcmlhYmxlLCBmb29kKSAlPiUKICBzdW1tYXJpc2UobWVhbl9pbXBvcnRhbmNlID0gbWVhbihpbXBvcnRhbmNlKSkgJT4lCiAgYXJyYW5nZSh2YXJpYWJsZSwgZGVzYyhtZWFuX2ltcG9ydGFuY2UpKQpgYGAKIyMjIHRvcCAxMCByYW5rZWQgbGlzdApgYGB7cn0KIyBHZXR0aW5nIHRoZSByYW5rZWQgbGlzdApyYW5raW5nID0gaW1wb3J0YW5jZV9zdW1tYXJ5ICU+JQogIGZpbHRlcih2YXJpYWJsZSA9PSAiZ3JvdXBfc2h1ZmZsZSIpICU+JQogIHB1bGwoZm9vZCkKCnRvcDEwID0gcmFua2luZ1sxOjEwXQpgYGAKCiMjIyBJbXBvcnRhbmNlIHBsb3QKYGBge3J9CmltcG9ydGFuY2VfZGYgJT4lCiAgcGl2b3RfbG9uZ2VyKGNvbHMgPSBjKCJIZWxpYW50aHVzLmFubnV1cyI6IlJoZXVtLnJoYXBvbnRpY3VtIiksIG5hbWVzX3RvID0gImZvb2QiLCB2YWx1ZXNfdG8gPSAiaW1wb3J0YW5jZSIpICAlPiUKICBmaWx0ZXIodmFyaWFibGUgPT0gImdyb3VwX3NodWZmbGUiICYgZm9vZCAlaW4lIHRvcDEwKSAlPiUKICBnZ3Bsb3QoKSsKICBnZW9tX2JveHBsb3QoYWVzKHg9IHJlb3JkZXIoZm9vZCwtaW1wb3J0YW5jZSksIHkgPSBpbXBvcnRhbmNlKSkrCiAgbGFicyh4PSAiRm9vZCIsIHkgPSJJbXBvcnRhbmNlIikgKwogIHRoZW1lX2NsYXNzaWMoKSArCiAgdGhlbWUoYXhpcy50ZXh0LnggPSBlbGVtZW50X3RleHQoYW5nbGUgPSA0NSwgaGp1c3Q9MSkpCmBgYAoKU29tZSBvZiB0aGUgaW1wb3J0YW50IE5BcyBmb3IgcHJlZGljdGlvbnMgb25seSBtYXAgdG8gYmFjdGVyaWFsIHNwZWNpZXMgaW4gQkxBU1QuCgojIEltcG9ydGFudCB0YXhhIGRpcmVjdGlvbnMKCmBgYHtyfQpzYW1wbGVfZGF0YShwX3BzKSAlPiUKICBkYXRhLmZyYW1lKCkgJT4lCiAgZmlsdGVyKHRpbWVwb2ludCA9PSAiRW50cnkiKSAlPiUKICBnZ3Bsb3QoKSArCiAgZ2VvbV9ib3hwbG90KGFlcyh4PSBncm91cCwgeSA9IGJtaSkpKwogIHRoZW1lX2NsYXNzaWMoKQoKcGxvdF9mb29kX2RpcmVjdGlvbih0b3AxMCwgZmVhdHVyZXMpCmBgYAoKCmBgYHtyfQpsb29rdXBfYXN2ICU+JQogIGZpbHRlcihuYW1lID09ICJUaGVvYnJvbWEuY2FjYW8iKSAlPiUKICBwdWxsKGFzdikKYGBgCgoK